The Start Of It All...
#Change the width of the notebook
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
#Standard Libraries
import numpy as np
import pandas as pd
from IPython.display import display # Allows the use of display() for DataFrames
import random
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
#User counter to check frequencies (https://pymotw.com/2/collections/counter.html)
from collections import Counter
#Visualisation Libraries
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from pandas.plotting import scatter_matrix
import seaborn as sns
# Pretty display for notebooks
%matplotlib inline
# Import supplementary visualizations code visuals.py
import visuals as vs
#Modelling Libraries
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn import tree
from sklearn.metrics import fbeta_score,accuracy_score
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import boxcox
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler
# Toggle between the small 5k sample (fast iteration) and the full 60k export.
sampleMode = 1
if sampleMode == 1:
    file = "sample_model_data_protected_small5k.csv"
else:
    file = "sample_model_data_protected_60k.csv"
# Load Company IP Intent Data (March Sample)
try:
    full_data = pd.read_csv(file)
    print("Company IP Intent dataset has {} samples with {} features each.".format(*full_data.shape))
except Exception as err:
    # Narrowed from a bare except (which also swallowed KeyboardInterrupt/SystemExit)
    # and surface the actual failure reason for debugging.
    print("Dataset could not be loaded!")
    print("Do not use excel UTF-8 format for csv, it breaks stuff...")
    print("Reason: " + str(err))
def getRandomIDXs(n, d):
    """Return n unique random positional indexes into d.

    Parameters
    ----------
    n : int
        Number of indexes to draw (capped at len(d)).
    d : sized collection (e.g. DataFrame or list)
        Only its length is used.

    Returns
    -------
    list[int]
        Unique indexes in the range [0, len(d)).

    Fixes two defects in the original: random.randint(0, N) is inclusive of
    N, so it could yield an out-of-range index, and duplicate draws were
    silently skipped, returning fewer than n indexes.
    """
    N = len(d)
    # Sample without replacement from the valid positional range [0, N).
    indices = random.sample(range(N), min(n, N))
    print("Sample indices selected - ", str(len(indices)))
    return indices
def getRandomSample(n,d):
    """Draw a random sample of round(n * len(d)) rows from DataFrame d.

    n is a fraction of the dataset (e.g. 0.005 keeps 0.5% of rows); the
    sampled rows come back with a fresh 0..k-1 index.
    """
    total_rows = len(d)
    draw_count = round(n * total_rows)
    chosen = getRandomIDXs(draw_count, d)
    # Materialise the selected rows and renumber the index for downstream use.
    sample = pd.DataFrame(d.reindex(chosen), columns=d.keys()).reset_index(drop=True)
    print(sample.shape)
    return sample
#Take a sample due to the size of the dataset here
# NOTE(review): a fraction of 1.0 keeps the whole dataset; lower it for quicker runs.
data = getRandomSample(1.0,full_data)
#data = full_data
print("Chosen samples of Company Intent customers dataset:",str(len(data)))
# Notebook-cell peeks at the first rows and the column dtypes.
data[:3]
data.dtypes
#Null fill values to prevent NaN issues
def zeroFillColumn(d):
    """Fill missing values: 0 for numeric columns, "Unknown" for all others.

    Parameters
    ----------
    d : pd.DataFrame
        Mutated column-by-column.

    Returns
    -------
    pd.DataFrame
        The same DataFrame, returned for chaining.
    """
    for col in d.columns:
        if is_numeric_dtype(d[col]):
            # Assign back instead of fillna(..., inplace=True) on a column
            # slice, which triggers chained-assignment warnings/errors in
            # modern pandas.
            d[col] = d[col].fillna(0)
        else:
            d[col] = d[col].fillna("Unknown")
    return d
# Replace NaNs up front so later percentile/correlation maths don't break.
data = zeroFillColumn(data)
In this section I will attempt to find correlations in my data using summary statistics and visualisations, to help guide the supervised learning.
This dataset has been derived from a production system, so some metrics may be too sparse, or simply too unreliable due to poor matching between disparate datasets.
Flag and remove these early on, mainly for performance reasons.
# Display a description of the dataset
t=data.describe()
# Persist the summary statistics for offline review.
t.to_csv('summary_statistics.csv')
data.describe()
Ignore NAICS codes.
hitsSum/pageViewsSum/uniqueViewsSum/most of the iabCat hits vary massively between accounts: highly right-skewed, with the standard deviation well above the 75th percentile.
#Group columns together for analysis
iabCat_hits_cols = [col for col in data.columns if 'hits_iabCat' in col]
summaryMetricCols = ['datesCount', 'domainsCount', 'hitsSum','pageViewsSum', 'uniqueViewsSum', 'clicks', 'clickDates']
labelCols = ['acct_id','hq_id','company','userDomain','city','region','countryCode','revenue_mil_usd','total_employees']
# Renamed from the misspelt 'coninuousCols' (verified unused elsewhere in the file).
continuousCols = iabCat_hits_cols + summaryMetricCols + ['revenue_mil_usd','total_employees']
# Boxplots to eyeball spread and outliers per metric group.
boxplot = data.boxplot(column=summaryMetricCols,figsize=(15,15))
boxplot = data.boxplot(column=iabCat_hits_cols,figsize=(15,15))
#Get Outliers IDXs per feature column
def getOutLierIDXs(d):
    """Detect per-feature outliers using Tukey's 1.5*IQR rule.

    Parameters
    ----------
    d : pd.DataFrame
        Data to scan; non-numeric columns are skipped.

    Returns
    -------
    tuple
        (outliers, featureDict): outliers is a list of row indexes flagged
        for ANY feature (repeats when a row is an outlier on several
        features - callers rely on that to count multi-feature outliers);
        featureDict maps each numeric feature name to its outlier count.

    Note: the fences use <= and >=, so when a feature's IQR is zero every
    row is flagged - the column-blacklisting step downstream depends on that.
    (Removed a dead 'typeDict' computation that read the global 'data'
    instead of the parameter and was never used.)
    """
    outliers = []
    featureDict = {}
    for feature in d.keys():
        # Skip non-numeric columns (labels, categories, etc.)
        if is_numeric_dtype(d[feature]):
            # Quartiles for the given feature
            Q1 = np.percentile(d[feature], 25)
            Q3 = np.percentile(d[feature], 75)
            # Tukey fence: 1.5 times the interquartile range
            step = (Q3 - Q1) * 1.5
            # Outlier if at/below the lower fence OR at/above the upper fence
            featureOutliers = d[(d[feature] <= Q1 - step) | (d[feature] >= Q3 + step)]
            outlierIDXs = featureOutliers.index.tolist()
            featureDict[feature] = len(featureOutliers)
            print("Number of outliers detected for feature - " + str(feature) + " = " + str(len(featureOutliers)))
            # Accumulate into the global list of outlier indexes
            outliers.extend(outlierIDXs)
    return (outliers, featureDict)
#Remove any columns where all values are classed as outliers, a good few iabCat columns
res = getOutLierIDXs(data)
outliers = res[0]
featureDict = res[1]
#Clean dataset
datac = data.copy()
# Columns where every single row was flagged carry no usable signal - blacklist them.
badBoys = []
for feature in featureDict.keys():
    if (featureDict[feature]) == len(datac):
        badBoys.append(feature)
        print(str(feature) + "- Feature added to blacklist")
#Drop Bad columns
for feature in badBoys:
    datac.drop([feature], axis = 1, inplace = True)
#Remove outlier function
def getMultiClassOutliers(d):
    """Return indexes of rows flagged as outliers on more than one feature."""
    # Per-column outlier indexes; repeats mark rows flagged on several features.
    flagged, _feature_counts = getOutLierIDXs(d)
    occurrence = Counter(flagged)
    multiOutliers = []
    for idx, hits in occurrence.items():
        if hits > 1:
            multiOutliers.append(idx)
            print('Multi-Feature outlier found at index = ' + str(idx))
    print('Total multi-Feature outliers found = ' + str(len(multiOutliers)) + ' out of sample of ' + str(len(d)))
    return multiOutliers
#Re-run outlier detection and remove multi-feature outliers
outliersN = getMultiClassOutliers(datac)
#Remove outliers
print('Total outliers detected = '+str(len(outliersN)) + ' out of a total sample of ' + str(len(datac)))
# Drop the multi-feature outlier rows and renumber the index 0..k-1.
good_data = datac.drop(datac.index[outliersN]).reset_index(drop = True)
print('Muli-class outliers removed, and noisey columns removed. Remaining rows = {}, remaining columns = {}'.format(*good_data.shape))
#Reset Metric Columns after blacklisting
iabCat_hits_cols = [col for col in good_data.columns if 'hits_iabCat' in col]
summary_metric_cols = ['datesCount', 'domainsCount', 'hitsSum','pageViewsSum', 'uniqueViewsSum', 'clicks', 'clickDates']
label_cols = ['acct_id','hq_id','company','userDomain','city','region','countryCode','revenue_mil_usd','total_employees']
continuous_cols = iabCat_hits_cols + ['revenue_mil_usd','total_employees']
#Ensure only columns still remaining after the blacklist survive.
# Filter with comprehensions instead of set intersection: set iteration order
# depends on string hashing, which is randomised per interpreter run, so the
# original gave a non-deterministic column order for the unsorted lists.
current_cols = set(good_data.columns)
iabCat_hits_cols = sorted(col for col in iabCat_hits_cols if col in current_cols)
summary_metric_cols = [col for col in summary_metric_cols if col in current_cols]
label_cols = [col for col in label_cols if col in current_cols]
continuous_cols = sorted(set(col for col in continuous_cols if col in current_cols))
To get a better understanding of the customers and how their data will transform through the analysis, it would be best to select a few sample data points and explore them in more detail.
#Generate random samples to investigate quickly and visually
# 0.5% of the cleaned rows is enough for a quick visual inspection.
portion=0.005
samples = getRandomSample(portion,good_data)
sample_size=len(samples)
print("Chosen samples of Company Intent customers dataset:")
display(samples)
#Visualise Sample data
samples_for_plot = samples.copy()
samples_for_plot = samples_for_plot[summary_metric_cols]
#Append the dataset median as a reference row to compare each sample against.
samples_for_plot.loc[sample_size + 1] = good_data.median()
#Create labels: one per sample row plus the trailing median row.
labels = ['Sample_' + str(i) for i in range(sample_size)]
labels.append('Median')
#Plot graph. (Removed the unused columnsN variable and the stray plt.figure()
#call that opened an empty extra figure before plt.show().)
samples_for_plot.plot(kind='bar',figsize=(20,20))
plt.xticks(range(sample_size+1),labels)
plt.show()
#For a better comparison, we will look at the heat map of the data
summary_data = samples[summary_metric_cols]
# Rank each sampled row against the other samples, expressed as percentiles 0-100.
percentiles_summary_data = 100*summary_data.rank(pct=True)
plt.figure(figsize=(8,8))
sns.heatmap(percentiles_summary_data, annot=True)
#Correlation heatmap!
plt.figure(figsize=(10,10))
sns.heatmap(summary_data.corr(),annot=True)
#Look at breakdown by IAB Cat
continuous_data = good_data[continuous_cols]
#Expensive operation so run the calculations first and graph separately
correlations = continuous_data.corr()
plt.figure(figsize=(20, 20))
sns.heatmap(correlations,annot=True)
Quick analysis to see if there are any clear inferences a supervised learning model could generate from the data.
#Prepare target value (category 19 is what we specialise in selling so let's try this one)
target = 'hits_iabCat_IAB_19'
#Returns a series for all the target values (same as keying a dict in kdb!)
target_data = continuous_data[target]
#Ravel series data as single column target
target_data = target_data.ravel()
#Drop the target column so the remaining features predict it.
#Reassign instead of an inplace drop: continuous_data is a slice of good_data,
#and inplace mutation of a slice raises SettingWithCopy warnings in pandas.
continuous_data = continuous_data.drop([target], axis = 1)
#Message fixed: this is the Company Intent dataset, not the (copy-pasted)
#"Wholesale customers" dataset from the original template.
print("Company Intent dataset created to predict " + target + " has {} samples with {} features each.".format(*continuous_data.shape))
#Split the data into training and testing sets(0.25)
X_train, X_test, y_train, y_test = train_test_split(continuous_data, target_data, test_size=0.25, random_state=10)
#Create a decision tree regressor and fit it to the training set
regressor = tree.DecisionTreeRegressor(max_depth=25)
regressor = regressor.fit(X_train,y_train)
print('DecisionTreeRegressor fit to training data')
#Report the score of the prediction using the testing set.
#(Removed the no-op 'testPredictions = testPredictions' self-assignment.)
testPredictions = regressor.predict(X_test)
print('DecisionTreeRegressor test predictions ran')
#Score must be relevant to a regressor (r2 score used)
score = regressor.score(X_test, y_test)
print(score)
To get a better understanding of the dataset, we can construct a scatter matrix of each pair of features present in the data. If the feature you attempted to predict above is relevant for identifying a specific customer, then the scatter matrix below may not show any correlation between that feature and the others. Conversely, if you believe that feature is not relevant, the scatter matrix might show a correlation between it and another feature. Run the code block below to produce a scatter matrix.
# Produce a scatter matrix for each pair of features in the data
axes = scatter_matrix(continuous_data, alpha=0.75, figsize = (40,40), diagonal = 'kde')
corr = continuous_data.corr().values
# Annotate the upper triangle of the matrix with the pairwise correlation values.
for i, j in zip(*np.triu_indices_from(axes, k=1)):
    axes[i, j].annotate("%.3f" %corr[i,j], (0.8, 0.8), xycoords='axes fraction', ha='center', va='center')
#Limit to only continuous features
bench_data = good_data[continuous_cols]
bench_samples = samples[continuous_cols]
#Apply PCA by fitting the good data with the same number of dimensions as features
# np.unique also de-duplicates column names; its length drives n_components.
components = np.unique(bench_data.keys())
nc = componentsN = len(components)
display(components)
pca = PCA(n_components=componentsN).fit(bench_data)
print('Total dataset components = ' + str(componentsN))
#Transform the good data using the PCA fit above
reduced_data = pca.transform(bench_data)
#Transform samples using the PCA fit above
pca_samples = pca.transform(bench_samples)
# Generate PCA results plot
pca_results = vs.pca_results(bench_data, pca)
#Transform samples using the PCA fit above (repeat of the call above; harmless no-op)
pca_samples = pca.transform(bench_samples)
#Create a DataFrame for the reduced data, one column per principal component.
column_names = []
for i in range(1,nc+1):
    name = 'Dimension '+str(i)
    column_names.append(name)
reduced_data = pd.DataFrame(reduced_data, columns = column_names)
#https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html
#Benchmark clustering: K-means on the raw (unscaled, un-reduced) data.
#Apply your clustering algorithm of choice to the reduced data
clusterer = KMeans(n_clusters=3,random_state=1).fit(bench_data)
#Predict the cluster for each data point
preds = clusterer.predict(bench_data)
#Find the cluster centers (not applicable to DB scan)
centers = clusterer.cluster_centers_
#display(centers)
#Calculate the mean silhouette coefficient for the number of clusters chosen
score = silhouette_score(bench_data, preds)
print(score)
# Create a biplot
vs.biplot(bench_data, reduced_data, pca)
#Example https://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html#sphx-glr-auto-examples-cluster-plot-dbscan-py
#Same K-means again, but on the PCA-reduced data for comparison.
clustersN=3
#Apply your clustering algorithm of choice to the reduced data
clusterer = KMeans(n_clusters=clustersN,random_state=1).fit(reduced_data)
#Predict the cluster for each data point
preds = clusterer.predict(reduced_data)
#Find the cluster centers (not applicable to DB scan)
centers = clusterer.cluster_centers_
#display(centers)
#Calculate the mean silhouette coefficient for the number of clusters chosen
score = silhouette_score(reduced_data, preds)
print(score)
There appear to be very good scores here. But looking at the visualisation, this is a false positive: due to the distribution of the data, the algorithm is simply placing the vast majority of points into one cluster that is far away from the other 2 clusters.
# Display the results of the clustering from implementation
vs.cluster_results(reduced_data, preds, centers, pca_samples)
#Limit to only continuous features
good_data = good_data[continuous_cols]
samples = samples[continuous_cols]
#Scale the data using the natural logarithm
# +1 guards against log(0); the shift is undone later when centers are inverted.
log_data = good_data.apply(lambda x: np.log(x+1))
log_samples = samples.apply(lambda x: np.log(x+1))
# Produce a scatter matrix for each pair of newly-transformed features
axes = scatter_matrix(log_data, alpha=0.75, figsize = (40,40), diagonal = 'kde')
corr = log_data.corr().values
for i, j in zip(*np.triu_indices_from(axes, k=1)):
    axes[i, j].annotate("%.3f" %corr[i,j], (0.8, 0.8), xycoords='axes fraction', ha='center', va='center')
#Not quite normally distributed for a lot of columns. The Box-Cox transformation
#would be another option, BUT WE CAN'T USE IT AS WE HAVE 0 VALUES PRESENT.
#boxcox_data = reduced_data.apply(lambda x: boxcox(x)[0])
#pd.scatter_matrix(boxcox_data, alpha = 0.3, figsize = (14,10), diagonal = 'kde');
#Apply PCA by fitting the log-scaled data with the same number of dimensions as features
components = np.unique(log_data.keys())
componentsN = len(components)
display(components)
pca = PCA(n_components=componentsN).fit(log_data)
print('Total dataset components = ' + str(componentsN))
#Transform the good data using the PCA fit above
reduced_data = pca.transform(log_data)
#Transform log_samples using the PCA fit above
pca_samples = pca.transform(log_samples)
# Generate PCA results plot
pca_results = vs.pca_results(log_data, pca)
# Cumulative explained variance guides the n_components choice below.
print(pca_results['Explained Variance'].cumsum())
# Display sample log-data after having a PCA transformation applied
# NOTE(review): sample_size is used as the np.round decimals argument here,
# which looks accidental - confirm whether a fixed precision was intended.
display(pd.DataFrame(np.round(pca_samples, sample_size), columns = pca_results.index.values))
#Refit PCA keeping only the leading components (~90% of variance explained).
# NOTE(review): the original comment said 11 dimensions but nc is 10 - confirm
# which was intended against the cumulative variance printout above.
nc = 10
pca = PCA(n_components=nc).fit(log_data)
#Transform the good data using the PCA fit above
reduced_data = pca.transform(log_data)
#Transform log_samples using the PCA fit above
pca_samples = pca.transform(log_samples)
#Create a DataFrame for the reduced data, one column per retained component.
column_names = []
for i in range(1,nc+1):
    name = 'Dimension '+str(i)
    column_names.append(name)
reduced_data = pd.DataFrame(reduced_data, columns = column_names)
# Display sample log-data after applying PCA transformation in two dimensions
#display(reduced_data)
A biplot is a scatterplot where each data point is represented by its scores along the principal components. The axes are the principal components. In addition, the biplot shows the projection of the original features along the components. A biplot can help us interpret the reduced dimensions of the data, and discover relationships between the principal components and original features.
# Create a biplot: points in principal-component space with the original
# feature directions overlaid.
vs.biplot(log_data, reduced_data, pca)
Investigate if feature scaling and dimensionality reduction aids in the creation of better clusters
#Example https://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html#sphx-glr-auto-examples-cluster-plot-dbscan-py
def applyKmeans(k,data):
    """Fit K-means with k clusters on data and print the silhouette score.

    Parameters
    ----------
    k : int
        Number of clusters.
    data : array-like / DataFrame
        The data to cluster and score.

    Bug fix: the original predicted and scored against the global
    reduced_data regardless of the data argument (and kept an unused
    cluster_centers_ local); it now consistently uses the data it is given.
    """
    clusterer = KMeans(n_clusters=k,random_state=1).fit(data)
    #Predict the cluster for each data point
    preds = clusterer.predict(data)
    #Mean silhouette coefficient for the chosen number of clusters
    score = silhouette_score(data, preds)
    print("K="+str(k) + ", Silhouette Score - "+ str(score))
#Investigate what is the optimal value for k
# Sweep a wide range of cluster counts and compare silhouette scores.
clustersN=[2,3,4,5,10,15,20,25,30,35,40,45,50,60,70,80,90,100]
for i in clustersN:
    applyKmeans(i,reduced_data)
# k chosen after the silhouette sweep above.
k=2
#Apply clustering algorithm to the data
clusterer = KMeans(n_clusters=k,random_state=1).fit(reduced_data)
#Predict the cluster for each data point
preds = clusterer.predict(reduced_data)
#Find the cluster centers (not applicable to DB scan)
centers = clusterer.cluster_centers_
#Calculate the mean silhouette coefficient for the number of clusters chosen
score = silhouette_score(reduced_data, preds)
# Display the results of the clustering from implementation
vs.cluster_results(reduced_data, preds, centers, pca_samples)
Applying the inverse operations to the data following transformations
#Inverse transform the centers from PCA space back to log space
log_centers = pca.inverse_transform(centers)
#Exponentiate the centers and remove the 1 from earlier (used to prevent log 0 error)
true_centers = np.exp(log_centers) - 1
# Display the true centers, one labelled row per segment
segments = ['Segment {}'.format(i) for i in range(0,len(centers))]
true_centers = pd.DataFrame(np.round(true_centers), columns = good_data.keys())
true_centers.index = segments
display(true_centers)
# Persist the segment centers for sharing outside the notebook.
true_centers.to_csv('summary_clusters.csv')
This is definitely an improvement on the benchmark model, as we can at least see some form of clustering that might provide useful insights to marketers.
Investigate alternative clustering methods where the number of clusters is found based on distance (epsilon):
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html
sklearn.cluster.DBSCAN(eps=0.5, min_samples=5, metric='euclidean', metric_params=None, algorithm='auto', leaf_size=30, p=None, n_jobs=None)[source]
# Compute DBSCAN
def applyDBSCAN(epsilon,data):
    """Run DBSCAN at the given epsilon and print cluster/noise/silhouette stats.

    Parameters
    ----------
    epsilon : float
        Neighbourhood radius (eps) for DBSCAN; min_samples is fixed at 10.
    data : array-like / DataFrame
        The data to cluster.

    A score of 0 is reported when every point is noise or fewer than two
    clusters exist, since the silhouette score is undefined there.
    (Removed the unused core_samples_mask computation carried over from the
    scikit-learn example this was adapted from.)
    """
    db = DBSCAN(eps=epsilon, min_samples=10).fit(data)
    labels = db.labels_
    # Number of clusters in labels, ignoring noise (label -1) if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)
    #Results
    if (n_noise_ == len(data)) or len(set(labels)) <= 1:
        score = 0
    else:
        score = metrics.silhouette_score(data, labels)
    print("Epsilon= " + str(epsilon)+ ". Silhouette Score= "+str(score) + ". Clusters= "+str(n_clusters_)+" with an estimated number of noise points = "+ str(n_noise_))
# Sweep epsilon across several orders of magnitude to probe cluster density.
epsilons=[0.0001, 0.001,0.01,1,2,3,4,5,6,7,8,9,10,20,30,40,50]
for i in epsilons:
    applyDBSCAN(i,reduced_data)
Only a single cluster can be derived at best. This further validates the idea that there are no truly distinct clusters present in the dataset.
Using k-means to try to determine if there are distinct clusters of companies within this dataset, based on their web activity on websites of different categorisations (as per the IAB taxonomy), there appears to be no distinct behaviour of companies. A silhouette score of 0.19 highlights this, along with the clustering visualisation above, which shows clusters with large overlaps.
Next Steps for the Business:
There are clearly some serious data quality issues here. The business needs to decide whether it is worth investing in monitoring any of the blacklisted categories, or whether a decision should be made to stop tracking activity on blacklisted IAB-categorised websites.
Click data needs to be increased in order to drive any significant machine learning inferences going forward. Right now we do not have significant data across all accounts. I'd suggest we invest in serving display ads to all companies in our database to build up a baseline of activity and help drive future analysis.
There are some serious outliers, which should be taken into consideration with the existing scoring models, as they could skew the entire accounts list. It is highly likely they will always bubble to the top of client accounts.
Further Analysis: